Load required libraries.
# Attach required packages.
# NOTE(review): the original script began with rm(list = ls()), which
# silently wipes the user's workspace without actually giving a fresh
# session (loaded packages, options, and the search path survive).
# Removed per standard R practice -- restart R for a clean session.
library(ggplot2)      # plotting
library(dplyr)        # data-manipulation verbs, src_mysql/tbl
library(tidyr)        # reshaping (gather)
library(RMySQL)       # MySQL driver backing the Wordbank connection
library(stringr)      # string helpers (str_replace)
library(magrittr)     # pipe operators
library(pcaPP)        # cor.fk, used in the archived analyses below
library(directlabels) # geom_dl for in-panel line labels
library(proto)        # dependency of directlabels
Load in Wordbank data.
## OPEN DATABASE CONNECTION ##
# Remote Wordbank MySQL instance.
# NOTE(review): host and credentials are hard-coded in source -- presumably
# a public read-only mirror, but consider moving them to a config file or
# environment variables; verify before publishing.
wordbank <- src_mysql(dbname='wordbank',host="54.149.39.46",
user="wordbank",password="wordbank")
## NOW LOAD TABLES ##
# Lazy dplyr table references -- no rows are pulled from the database
# until a later as.data.frame() forces the query.
source.table <- tbl(wordbank,"common_source")
admin.table <- tbl(wordbank,"common_administration")
child.table <- tbl(wordbank,"common_child")
wordmapping.table <- tbl(wordbank,"common_wordmapping")
instruments.table <- tbl(wordbank,"common_instrumentsmap")
# One wide table per language x form instrument (Words & Sentences).
english.ws.table <- tbl(wordbank,"instruments_english_ws")
spanish.ws.table <- tbl(wordbank,"instruments_spanish_ws")
norwegian.ws.table <- tbl(wordbank,"instruments_norwegian_ws")
danish.ws.table <- tbl(wordbank,"instruments_danish_ws")
Get kid data and put together.
# Get administration info
# One row per CDI administration: the form's data id, the child it
# belongs to, the child's age at administration, and the data source.
admins <- admin.table %>%
select(data_id,child_id,age,source_id) %>%
rename(id = data_id, child.id = child_id, source.id = source_id)
# Force the lazy query and pull the result into memory.
admins <- as.data.frame(admins)
# Get demographic variables for each child
demos <- select(child.table,id,sex,mom_ed,birth_order) %>%
rename(child.id = id) # Rename id fields
demos <- as.data.frame(demos)
# Join age and demographics together
# Implicit join key: child.id (the only column shared by both frames).
child.data <- as.tbl(left_join(admins,demos))
Set up mappings and instruments.
# Pull the item -> instrument mapping into memory.
mapping <- as.data.frame(wordmapping.table)
# Instrument metadata (language, form, ...); rename id so it matches the
# mapping table's foreign-key column for the join below.
instruments <- as.data.frame(instruments.table) %>%
rename(instrument_id = id)
# One row per item, carrying its instrument's metadata.
# Implicit join key: instrument_id.
items <- left_join(mapping, instruments)
Function for getting all of the data in wordbank for a given language (kid x item).
get.language.data <- function(lang.table, lang.items, lang, child.data) {
  # Build a long-form (kid x item) data frame for one language's
  # Words & Sentences (WS) form.
  #
  # Args:
  #   lang.table:  the instrument's wide database table (one column per
  #                item, one row per administration).
  #   lang.items:  item metadata (wordmapping joined with instruments);
  #                must carry language, form, item, type, category, and
  #                lexical_category columns.
  #   lang:        language name used to filter lang.items, e.g. "English".
  #   child.data:  per-administration demographics, joined in by id.
  #
  # Returns: a long data frame with one row per (administration, item),
  #   carrying the response value, item metadata, and child demographics.

  # Item metadata for this language's WS form.
  instrument.items <- lang.items %>%
    filter(language == lang, form == 'WS') %>%
    select(item, type, category, lexical_category) %>%
    mutate(item = str_replace(item, "\\.", "_")) # Fix _/. inconsistencies
  # NOTE(review): str_replace rewrites only the FIRST "." in each item
  # name -- confirm no item name contains more than one dot.

  # Responses in long form: one row per (administration id, item).
  instrument.data <- as.data.frame(lang.table) %>%
    rename(id = basetable_ptr_id) %>%             # Rename the id
    gather(item, value, -id) %>%                  # Arrange in longform
    mutate(item = str_replace(item, "item_", "")) # Strip off item_

  # Attach item metadata (by item), then demographics (by id).
  d <- left_join(instrument.data, instrument.items)
  # Return the final join directly: the original ended with an
  # assignment, whose value is returned invisibly.
  left_join(d, child.data)
}
Get kid x item data for all languages.
# Kid x item data for each language's WS form.
d.english <- get.language.data(lang.table = english.ws.table,
                               lang.items = items,
                               lang = "English",
                               child.data)
d.spanish <- get.language.data(lang.table = spanish.ws.table,
                               lang.items = items,
                               lang = "Spanish",
                               child.data)
d.norwegian <- get.language.data(lang.table = norwegian.ws.table,
                                 lang.items = items,
                                 lang = "Norwegian",
                                 child.data)
# Norwegian data is loaded in funny -- NAs on word/word_form items are
# presumably "not produced" responses; recode them as empty strings so
# they are counted rather than dropped by na.rm downstream. Column-first
# indexing replaces the fragile df[rows, ]$col <- replacement form.
norwegian.blank <- d.norwegian$type %in% c("word_form", "word") &
  is.na(d.norwegian$value)
d.norwegian$value[norwegian.blank] <- ""
d.danish <- get.language.data(lang.table = danish.ws.table,
                              lang.items = items,
                              lang = "Danish",
                              child.data)
# Danish data has the same loading quirk.
danish.blank <- d.danish$type %in% c("word_form", "word") &
  is.na(d.danish$value)
d.danish$value[danish.blank] <- ""
Function for getting vocab size data.
language.vocab.sizes <- function(lang.data) {
  # Vocabulary scores per (age, administration id): the count and the
  # proportion of word items the child produces.
  #
  # Args:
  #   lang.data: long kid x item data (output of get.language.data).
  # Returns: one row per (age, id) with vocab.sum and vocab.mean.
  word.items <- lang.data %>%
    filter(type == "word") %>%
    group_by(age, id)
  summarise(word.items,
            vocab.sum  = sum(value == "produces", na.rm = TRUE),
            vocab.mean = mean(value == "produces", na.rm = TRUE))
}
Function for getting kid x {vocab size, syntax score, morphology score} data.
summarise.language.data <- function(lang.data, lang) {
  # Per-child composite scores for one language: vocabulary size,
  # syntax (complexity) score, and morphology (word-form) score.
  #
  # Args:
  #   lang.data: long kid x item data (output of get.language.data).
  #   lang:      language label attached to the result.
  # Returns: one row per (age, id) with vocab, complexity, and wordform
  #   scores; children with any unanswered complexity item are dropped.
  vocab.scores <- language.vocab.sizes(lang.data)

  # Proportion of complexity items marked "complex"; also count the
  # unanswered complexity items so incomplete sections can be dropped.
  complexity.scores <- lang.data %>%
    filter(type == "complexity") %>%
    group_by(age, id) %>%
    summarise(num.complexity.na = sum(is.na(value)),
              complexity = mean(value == "complex", na.rm = TRUE))

  # Proportion of word-form items produced.
  wordform.scores <- lang.data %>%
    filter(type == "word_form") %>%
    group_by(age, id) %>%
    summarise(wordform = mean(value == "produces", na.rm = TRUE))

  # Ending items are skipped, at least for now: Spanish has no ending
  # data (the original ending-score code is archived in version control).

  composite <- left_join(vocab.scores, complexity.scores) %>%
    left_join(wordform.scores) %>%
    filter(num.complexity.na == 0) %>% # Complete complexity sections only
    select(-num.complexity.na)
  composite$language <- lang
  return(composite)
}
Get kid x {vocab size, syntax score, morphology score} data for all languages and aggregate them.
# Per-language composites, stacked and restricted to ages 16-30 months.
summary.english <- summarise.language.data(d.english,"English")
summary.spanish <- summarise.language.data(d.spanish,"Spanish")
summary.norwegian <- summarise.language.data(d.norwegian,"Norwegian")
summary.danish <- summarise.language.data(d.danish,"Danish")
# bind_rows() replaces rbind_list(), which is deprecated and removed in
# current dplyr; behavior is the same (rows stacked, columns matched by
# name).
summary.data <- bind_rows(summary.english, summary.spanish,
                          summary.norwegian, summary.danish) %>%
  filter(age > 15 & age < 31) %>%
  mutate(age.group = cut(age, breaks = c(15, 20, 25, 30)),
         language = factor(language,
                           levels = c("English", "Spanish", "Norwegian", "Danish")))
# gather for plotting: one row per (kid, measure), with standardized
# vocabulary and age for the archived regression models.
ms <- summary.data %>% gather(measure, score, complexity:wordform) %>%
  mutate(measure = factor(measure, levels = c("wordform", "complexity"),
                          labels = c("Word Form", "Complexity")),
         s.vocab = scale(vocab.mean),
         s.age = scale(age))
Using Age and Vocab to predict Morphology and Syntax Scores.
#quartz(width=8,height=7.5)
# Morphology/syntax score vs. vocabulary proportion, colored by age
# group, one panel per language x measure. The quadratic (y ~ x^2)
# smoother mirrors the I(vocab^2) regressions archived at the bottom
# of this file.
ggplot(ms,aes(x = vocab.mean, y = score, colour = age.group, fill = age.group,
label = age.group)) +
geom_jitter(size=.8)+
geom_smooth(method="lm", formula = y ~ I(x^2)) +
facet_grid(language~measure) +
scale_x_continuous(limits = c(0,1), breaks = seq(0,1,.1),
name = "Vocabulary Size") +
scale_y_continuous(limits = c(0, 1.05), breaks = seq(0,1,.2),"Score (Mean Items)") +
theme_bw(base_size = 14) +
scale_color_brewer(palette="Set1") +
scale_fill_brewer(palette="Set1")
Using Morphology scores to Predict Syntax scores.
#quartz()
# Syntax (complexity) score vs. morphology (word-form) score, by age
# group, one panel per language, with a linear fit per age group.
ggplot(summary.data,aes(x = wordform, y = complexity, fill=age.group,colour=age.group,
label=age.group)) +
facet_wrap( ~ language) +
geom_jitter(size=1)+
geom_smooth(method="lm", formula = y ~ x) +
scale_x_continuous(limits = c(0,1.05), breaks=seq(0,1,.2),name = "Morphology Score") +
scale_y_continuous(limits = c(0,1.05), breaks=seq(0,1,.2),"Syntax Score") +
scale_color_brewer(palette="Set1") +
scale_fill_brewer(palette="Set1") +
theme_bw(base_size = 14)
Function for computing vocabulary composition for each speaker of a language.
vocab.composition <- function(lang.data, lang) {
  # Per-child vocabulary composition: for each administration, the
  # proportion of the child's produced vocabulary in each lexical
  # category.
  #
  # Args:
  #   lang.data: long kid x item data (output of get.language.data).
  #   lang:      language label attached to the result.
  # Returns: one row per (id, lexical_category) with prop, plus the
  #   child's vocab.sum/vocab.mean.
  vocab.sizes <- language.vocab.sizes(lang.data)
  per.category <- lang.data %>%
    filter(type == "word") %>%
    group_by(id, lexical_category) %>%
    summarise(cat = sum(value == "produces", na.rm = TRUE))
  comp <- left_join(vocab.sizes, per.category) %>%
    mutate(prop = cat / vocab.sum) %>% # category share of total vocab
    select(-cat)
  comp$language <- lang
  return(comp)
}
Function for computing CDI form composition for all languages.
lang.vocab.composition <- function(lang.items) {
  # Composition of each language's WS form itself (not child data):
  # how many word items the form has in each lexical category, and what
  # proportion of the form that is. Used as a baseline in plots.
  #
  # Args:
  #   lang.items: item metadata with language/form/type/lexical_category.
  # Returns: one row per (language, lexical_category) with num.per.cat,
  #   n (total word items on the form), and prop.per.cat.
  ws.words <- filter(lang.items, form == "WS", type == "word")
  totals <- ws.words %>%
    group_by(language) %>%
    summarise(n = n())
  by.category <- ws.words %>%
    group_by(language, lexical_category) %>%
    summarise(num.per.cat = n())
  left_join(by.category, totals) %>%
    mutate(prop.per.cat = num.per.cat / n)
}
Get vocabulary composition data for all languages.
# get form compositions (dropping the catch-all "other" baseline)
lang.vocab.comp <- lang.vocab.composition(items) %>%
  filter(lexical_category != "other")
# get per-child composition data for each language
vocab.comp.english <- vocab.composition(d.english,"English")
vocab.comp.spanish <- vocab.composition(d.spanish,"Spanish")
vocab.comp.norwegian <- vocab.composition(d.norwegian,"Norwegian")
vocab.comp.danish <- vocab.composition(d.danish,"Danish")
# aggregate data for all languages together, ages 16-30 months.
# bind_rows() replaces rbind_list(), which is deprecated and removed in
# current dplyr; behavior is the same.
summary.vocab.comp <- bind_rows(vocab.comp.english, vocab.comp.spanish,
                                vocab.comp.norwegian, vocab.comp.danish) %>%
  filter(age > 15 & age < 31) %>%
  mutate(age.group = cut(age, breaks = c(15, 20, 25, 30)),
         language = factor(language,
                           levels = c("English", "Spanish",
                                      "Norwegian", "Danish")),
         lexical_category = factor(lexical_category,
                                   levels = c("nouns", "predicates",
                                              "function_words", "other"),
                                   labels = c("Nouns", "Predicates",
                                              "Function Words", "Other")))
Plot vocabulary composition by language.
# Proportion of each child's vocabulary per lexical category vs. total
# vocabulary size, one panel per language (function words excluded).
# Dashed grey horizontal lines mark each form's own category baselines;
# geom_dl labels the loess curves in-panel so no legend is needed.
ggplot(filter(summary.vocab.comp,lexical_category != "Function Words"),
aes(x=vocab.mean, y=prop, colour=lexical_category,
shape = lexical_category, fill = lexical_category,
label=lexical_category)) +
geom_point(size = 1, alpha = 0.25) +
facet_wrap(~ language) +
geom_hline(data=lang.vocab.comp,aes(yintercept=prop.per.cat),
linetype="dashed", color="grey") + #baselines for each language
geom_smooth(aes(group=lexical_category), method='loess', span=0.5) +
scale_y_continuous(name = "Proportion of total vocabulary") +
scale_x_continuous(name = "Vocabulary Size") +
geom_dl(aes(label=lexical_category), method=list("smart.grid")) +
theme_bw(base_size=14) +
scale_color_brewer(palette = "Set1") +
scale_fill_brewer(palette = "Set1")+
theme(axis.text.x = element_text(angle=-40, hjust = 0),
axis.title.y = element_text(vjust=0.35),
axis.title.x = element_text(vjust=-0.5),
legend.position="none")
(Old stuff that’s being kept around for possible future use.)
#```{r,fig.width=12,fig.height=7.5}
#Fit regressions to data
#t.lm1 <- lm(score ~ age + measure, data=ms)
#t.lm2 <- lm(score ~ I(vocab^2)*measure + age*measure, data=ms)
#t.lm3 <- lm(score ~ I(vocab^2)*measure*age.binned, data=ms)
#t.lm4 <- lm(score ~ I(vocab^2)*measure*age, data=ms)
#ms$predicted <- predict.lm(t.lm3,ms)
# Plot by age
#ggplot(ms,aes(x = vocab, y = score, colour = measure,label=measure))+
# facet_wrap(~ age)+
# geom_jitter(size=1)+
# geom_line(aes(y=predicted),size=.5)+
# scale_color_brewer(palette="Set1") +
# scale_x_continuous(limits = c(0,681), breaks = seq(0,680,100),name = "Vocabulary (WS)") +
# scale_y_continuous(limits = c(0, 1.05), breaks = seq(0,1,.2),"Score (Mean Items)") +
# theme_bw(base_size = 14)
#```
#Replot original correlation with fitted model
#```{r,fig.width=12,fig.height=7.5}
#ggplot(ms,aes(x = vocab, y = score, colour = age.binned, fill = age.binned,
# label = age.binned)) +
# geom_jitter(size=1.5)+
# geom_line(aes(y=predicted),size=1) +
# facet_wrap(~measure) +
# scale_x_continuous(limits = c(0,681), breaks = seq(0,680,100),name = "Vocabulary (WS)") +
# scale_y_continuous(limits = c(0, 1.05), breaks = seq(0,1,.2),"Score (Mean Items)") +
# theme_bw(base_size = 14) +
# scale_color_brewer(palette="Set1") +
# scale_fill_brewer(palette="Set1")
#```
#Compute some descriptives on syntactic items
#```{r,fig.width=7,fig.height=4}
# compute Kendall's tau -- cor.fk is a faster implementation than in stats::cor
#complex.cors <- cor.fk(as.matrix(bykid.syntax.vocab[,8:ncol(bykid.syntax.vocab)])) %>%
# as.data.frame
#names(complex.cors) <- str_replace(names(complex.cors),"complx","")
#row.names(complex.cors) <- str_replace(row.names(complex.cors),"complx","")
# make a dendrogram of the complex item similarities
#complex.dendro <- as.dendrogram(hclust(dist(complex.cors)))
#plot(complex.dendro)
#```
#Make a confusion matrix
#```{r,fig.width=8,fig.height=5}
# gather the columns for plotting as a confusion matrix
#complex.cors %<>%
# mutate(prompt = factor(row.names(complex.cors))) %>%
# gather(response,correlation,"01":"37")
#ggplot(complex.cors, aes(response, prompt)) +
# geom_tile(aes(fill = correlation)) +
# ylim(rev(levels(complex.cors$prompt))) +
# scale_fill_gradient(low = "white", high = "black",guide=FALSE) +
# labs(x="Response", y = "Prompt") +
# theme(legend.position = "none", axis.ticks = element_blank()) +
# theme_bw(base_size = 16)
#```
#Compute vocab x age interaction terms for each syntactic item
#```{r,fig.height=5,fig.width=8}
# write regression formulas for separately for each item
#formulas <- sapply(names(bykid.syntax.vocab)[8:ncol(bykid.syntax.vocab)],
# function(x) paste(x ,"~ I(vocab^2)*age + 0",collapse=""))
# compute interaction terms each item
#interaction.terms <- sapply(formulas, function(x)
# summary(glm(as.formula(x),data=bykid.syntax.vocab,
# family="binomial"))$coefficients[3,3])
#names(interaction.terms) <- 1:37
#rename results to be human-readable
#interaction.terms <- as.data.frame(interaction.terms) %>%
# mutate(item = 1:37) %>%
# rename(zscore = interaction.terms) %>%
# arrange(zscore) %>%
# mutate(item = factor(item,levels=item))
# plot interaction terms by item
#ggplot(interaction.terms,
# aes(x=item,y=zscore,fill=1))+
# geom_bar(stat="identity")+
# geom_hline(yintercept=mean(interaction.terms$zscore),
# lty=2)+
# theme_bw(base_size = 14) +
# scale_y_continuous(name="vocabulary size x age z-score",limits=c(0,15),
# breaks=seq(0,15,2.5))+
# scale_x_discrete(name="complexity CDI item")+
# scale_color_brewer(palette="Set1") +
# theme(legend.position="none")
#```
#Leftover analyses
#```{r}
#summary(lm(syntax ~ I(vocab^2) * age - 1, data=d))
#summary(lm(morpho ~ I(vocab^2) * age - 1, data=d))
#summary(lm(syntax ~ I(d$s.vocab^2) * d$age_bin - 1, data=d))
#summary(lm(morpho ~ I(d$s.vocab^2) * d$age_bin - 1, data=d))
#```